1 Introduction

In this document, we will visualise the cleaned cricket data (Test batting career and innings records).

library(tidyverse)
## ── Attaching packages ────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   1.0.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ───────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(mcvis)

2 Loading data

# Test batting career data; the path is resolved relative to the working directory.
clean_test_batting_career <- readr::read_csv(
  file = "./clean_test_batting_career.csv"
)
## Parsed with column specification:
## cols(
##   country = col_character(),
##   player_name = col_character(),
##   span = col_character(),
##   career_start = col_double(),
##   career_end = col_double(),
##   mat = col_double(),
##   inns = col_double(),
##   not_out = col_double(),
##   runs = col_double(),
##   hs = col_character(),
##   ave = col_double(),
##   century = col_double(),
##   half_century = col_double(),
##   ducks = col_double(),
##   fours = col_double(),
##   sixes = col_double(),
##   balls_faced = col_double(),
##   strike_rate = col_double()
## )
# Test batting innings data. The "inngings" spelling is kept as-is because it
# is both the on-disk file name and the object name.
clean_test_batting_inngings <- readr::read_csv(
  file = "./clean_test_batting_inngings.csv"
)
## Parsed with column specification:
## cols(
##   country = col_character(),
##   player_name = col_character(),
##   runs = col_character(),
##   mins = col_double(),
##   balls_faced = col_double(),
##   fours = col_double(),
##   sixes = col_double(),
##   strike_rate = col_double(),
##   pos = col_double(),
##   dismissal = col_character(),
##   inns = col_double(),
##   opposition = col_character(),
##   ground = col_character(),
##   start_date = col_character(),
##   test_number = col_character()
## )

3 Confirm relationships

# Sanity check: strike_rate / 100 plotted against runs per ball faced.
# The red line is the identity (intercept 0, slope 1); points lying on it
# mean the two quantities agree.
plot(clean_test_batting_career$strike_rate/100,
     clean_test_batting_career$runs/clean_test_batting_career$balls_faced)
abline(a = 0, b = 1, col = "red")

# Same check for the batting average: ave plotted against runs divided by
# (inns - not_out), again with the identity line for reference.
plot(clean_test_batting_career$ave,
     clean_test_batting_career$runs/(clean_test_batting_career$inns - clean_test_batting_career$not_out))
abline(a = 0, b = 1, col = "red")

4 Correlation between key numeric variables

Looking at the correlation plot, it is not clear as to which variable is the main culprit that causes multi-collinearity.

4.1 Corrplot

# Matrix used for the collinearity analysis: numeric career columns only,
# rows with any NA removed, `outs` (inns - not_out) derived before its inputs
# are dropped, and every remaining column put on a log10(x + 1) scale.
X <- clean_test_batting_career %>%
  dplyr::select_if(is.numeric) %>%
  na.omit() %>%
  dplyr::mutate(outs = inns - not_out) %>%
  dplyr::select(-c(career_start, career_end, inns, mat, not_out, runs)) %>%
  # The + 1L keeps zero counts finite under log10.
  dplyr::mutate_all(.funs = function(column) log10(column + 1L))

# Compact column-wise preview of the transformed data.
tibble::glimpse(X)
## Observations: 810
## Variables: 9
## $ ave          <dbl> 1.1914510, 0.8450980, 1.4410664, 1.3802112, 0.84509…
## $ century      <dbl> 0.3010300, 0.0000000, 0.3010300, 0.3010300, 0.00000…
## $ half_century <dbl> 0.0000000, 0.0000000, 0.6989700, 0.6020600, 0.00000…
## $ ducks        <dbl> 0.6989700, 0.4771213, 0.6020600, 0.6020600, 1.04139…
## $ fours        <dbl> 1.278754, 0.301030, 1.832509, 1.832509, 1.041393, 2…
## $ sixes        <dbl> 0.0000000, 0.0000000, 0.4771213, 0.8450980, 0.30103…
## $ balls_faced  <dbl> 2.628389, 2.181844, 3.199206, 2.850646, 2.294466, 3…
## $ strike_rate  <dbl> 1.588047, 1.395152, 1.650890, 1.840232, 1.724604, 1…
## $ outs         <dbl> 1.0791812, 0.8450980, 1.4313638, 1.3424227, 1.25527…
# The checks on the raw data relate ave to runs / (inns - not_out) and
# strike_rate / 100 to runs / balls_faced, so on a log10 scale
#   ave ~ strike_rate + balls_faced - outs - log10(100).
# The match is only approximate here because X holds log10(x + 1) values.
plot(X$ave, 
     X$strike_rate + X$balls_faced - X$outs - log10(100))
abline(a = 0, b = 1, col = "red")

# Correlation between ave and the same linear combination.
cor(X$ave, 
    X$strike_rate + X$balls_faced - X$outs - log10(100))
## [1] 0.9935072
# Per-column summary statistics of the transformed data.
skimr::skim(X)
## Skim summary statistics
##  n obs: 810 
##  n variables: 9 
## 
## ── Variable type:numeric ──────────────────────────────────
##      variable missing complete   n mean   sd   p0  p25  p50  p75 p100
##           ave       0      810 810 1.3  0.29 0.3  1.1  1.36 1.52 1.81
##   balls_faced       0      810 810 3.07 0.64 1.2  2.6  3.09 3.53 4.49
##       century       0      810 810 0.29 0.42 0    0    0    0.57 1.66
##         ducks       0      810 810 0.7  0.36 0    0.48 0.7  0.95 1.64
##         fours       0      810 810 1.77 0.69 0    1.28 1.8  2.26 3.22
##  half_century       0      810 810 0.54 0.53 0    0    0.48 0.95 1.83
##          outs       0      810 810 1.45 0.46 0.3  1.12 1.46 1.8  2.44
##         sixes       0      810 810 0.63 0.54 0    0    0.6  1    2.03
##   strike_rate       0      810 810 1.63 0.14 0.97 1.57 1.65 1.72 1.95
##      hist
##  ▁▁▂▃▅▆▇▃
##  ▁▂▅▆▇▇▅▂
##  ▇▂▁▁▁▁▁▁
##  ▃▅▇▆▇▅▂▁
##  ▁▂▆▆▇▇▅▂
##  ▇▂▂▃▂▂▁▁
##  ▁▃▅▇▇▇▅▂
##  ▇▅▃▅▃▂▁▁
##  ▁▁▁▁▃▇▅▁
# Correlation plot of the transformed variables, with squares in the upper
# triangle.
X %>%
  cor() %>%
  corrplot::corrplot.mixed(upper = "square")

4.2 d3heatmap

# Heatmap (d3heatmap) of the correlation matrix of X.
d3heatmap::d3heatmap(cor(X))

4.3 Scatter plot

# Scatter-plot matrix of X; the lower panels are drawn by CPOP::panel_cor.
pairs(X, lower.panel = CPOP::panel_cor)

5 Linear regression

# Regress the log-scale batting average on every other column of X.
m <- lm(formula = ave ~ ., data = X)

# Coefficient table and fit statistics.
summary(object = m)
## 
## Call:
## lm(formula = ave ~ ., data = X)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.038521 -0.008527 -0.001799  0.005104  0.204354 
## 
## Coefficients:
##               Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)  -1.272851   0.023695  -53.719  < 2e-16 ***
## century       0.032785   0.002999   10.934  < 2e-16 ***
## half_century  0.037008   0.003229   11.463  < 2e-16 ***
## ducks         0.006572   0.003679    1.786   0.0744 .  
## fours         0.050714   0.007768    6.528 1.18e-10 ***
## sixes         0.012127   0.002180    5.564 3.60e-08 ***
## balls_faced   0.826033   0.007218  114.448  < 2e-16 ***
## strike_rate   0.810211   0.010497   77.188  < 2e-16 ***
## outs         -0.978147   0.007212 -135.635  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01807 on 801 degrees of freedom
## Multiple R-squared:  0.996,  Adjusted R-squared:  0.996 
## F-statistic: 2.51e+04 on 8 and 801 DF,  p-value: < 2.2e-16

6 VIF

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
# Variance inflation factor for each predictor in the fitted model.
car::vif(m)
##      century half_century        ducks        fours        sixes 
##     3.980770     7.191969     4.315878    70.607615     3.407500 
##  balls_faced  strike_rate         outs 
##    52.709086     5.223086    26.840815

7 mcvis

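The mcvis method seems to point to the runs variable as the main cause of collinearity. (Note that `runs` is dropped from `X` above; in the MC matrix below, the largest entry in the tau1 row is balls_faced.)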

# mcvis is resampling-based (bootstrap by default -- confirm against the
# installed mcvis version), so without a fixed seed the MC matrix changes on
# every render. Fix the RNG seed so the reported numbers are reproducible.
set.seed(2019)
mcvis_result <- mcvis::mcvis(X, standardise_method = "studentise")

# MC matrix (tau rows by variable columns), rounded to 2 decimals for display.
round(mcvis_result$MC, 2)
##       ave century half_century ducks fours sixes balls_faced strike_rate
## tau9 0.02    0.08         0.40  0.02  0.04  0.39        0.01        0.02
## tau8 0.05    0.04         0.34  0.16  0.06  0.11        0.04        0.14
## tau7 0.13    0.09         0.19  0.06  0.03  0.25        0.15        0.07
## tau6 0.26    0.16         0.04  0.00  0.01  0.37        0.11        0.03
## tau5 0.01    0.21         0.03  0.06  0.02  0.53        0.06        0.07
## tau4 0.03    0.14         0.76  0.01  0.01  0.01        0.01        0.02
## tau3 0.19    0.01         0.04  0.70  0.01  0.01        0.02        0.00
## tau2 0.04    0.00         0.00  0.00  0.87  0.00        0.06        0.00
## tau1 0.15    0.00         0.00  0.00  0.00  0.00        0.56        0.00
##      outs
## tau9 0.02
## tau8 0.06
## tau7 0.03
## tau6 0.01
## tau5 0.01
## tau4 0.01
## tau3 0.02
## tau2 0.02
## tau1 0.28
# Draw the mcvis result using the package's ggplot-based plotting function.
mcvis_result %>%
  mcvis::ggplot_mcvis()

8 Session Info

# Record the R version, platform and package versions used for this render.
utils::sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] car_3.0-3       carData_3.0-2   mcvis_0.2.11    forcats_0.4.0  
##  [5] stringr_1.4.0   dplyr_0.8.3     purrr_0.3.2     readr_1.3.1    
##  [9] tidyr_1.0.0     tibble_2.1.3    ggplot2_3.2.1   tidyverse_1.2.1
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-141       lubridate_1.7.4    doParallel_1.0.15 
##  [4] RColorBrewer_1.1-2 httr_1.4.1         tools_3.6.1       
##  [7] backports_1.1.5    utf8_1.1.4         R6_2.4.0          
## [10] DT_0.9             rpart_4.1-15       lazyeval_0.2.2    
## [13] colorspace_1.4-1   nnet_7.3-12        withr_2.1.2       
## [16] tidyselect_0.2.5   mnormt_1.5-5       curl_4.2          
## [19] compiler_3.6.1     glmnet_2.0-18      cli_1.1.0         
## [22] rvest_0.3.4        CPOP_0.0.19        xml2_1.2.2        
## [25] labeling_0.3       d3heatmap_0.6.1.2  slam_0.1-45       
## [28] scales_1.0.0       mvtnorm_1.0-11     psych_1.8.12      
## [31] proxy_0.4-23       digest_0.6.21      foreign_0.8-72    
## [34] rmarkdown_1.16     rio_0.5.16         base64enc_0.1-3   
## [37] pkgconfig_2.0.3    htmltools_0.4.0    htmlwidgets_1.5.1 
## [40] rlang_0.4.0        readxl_1.3.1       rstudioapi_0.10   
## [43] visNetwork_2.0.8   generics_0.0.2     jsonlite_1.6      
## [46] zip_2.0.4          ModelMetrics_1.2.2 magrittr_1.5      
## [49] Matrix_1.2-17      Rcpp_1.0.2         munsell_0.5.0     
## [52] fansi_0.4.0        abind_1.4-5        lifecycle_0.1.0   
## [55] stringi_1.4.3      yaml_2.2.0         MASS_7.3-51.4     
## [58] plyr_1.8.4         recipes_0.1.7      grid_3.6.1        
## [61] parallel_3.6.1     HDCI_1.0-2         crayon_1.3.4      
## [64] lattice_0.20-38    splines_3.6.1      haven_2.1.1       
## [67] hms_0.5.1          zeallot_0.1.0      knitr_1.25        
## [70] pillar_1.4.2       igraph_1.2.4.1     stats4_3.6.1      
## [73] reshape2_1.4.3     codetools_0.2-16   glue_1.3.1        
## [76] evaluate_0.14      data.table_1.12.4  modelr_0.1.5      
## [79] png_0.1-7          vctrs_0.2.0        foreach_1.4.7     
## [82] cellranger_1.1.0   gtable_0.3.0       assertthat_0.2.1  
## [85] openxlsx_4.1.0.1   xfun_0.10          gower_0.2.1       
## [88] prodlim_2018.04.18 skimr_1.0.7        broom_0.5.2       
## [91] e1071_1.7-2        class_7.3-15       survival_2.44-1.1 
## [94] timeDate_3043.102  iterators_1.0.12   lava_1.6.6        
## [97] corrplot_0.84      caret_6.0-84       ipred_0.9-9